Author: Andrea Ierardi

Repository: Code link

The exam consists of two assignments: one on the first part (regression, trees, neural nets) and one on the second part (unsupervised learning). For both, you must prepare a written report applying one or more techniques and comparing their performance on one or more datasets chosen by the student.
library(knitr)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(png)
library(ggpubr)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ----------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
library(caTools)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(tree)
## Registered S3 method overwritten by 'tree':
## method from
## print.tree cli
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:plotly':
##
## select
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(ranger)
##
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
##
## importance
library(tuneRanger)
## Loading required package: mlrMBO
## Loading required package: mlr
## Loading required package: ParamHelpers
## 'mlr' is in maintenance mode since July 2019. Future development
## efforts will go into its successor 'mlr3' (<https://mlr3.mlr-org.com>).
##
## Attaching package: 'mlr'
## The following object is masked from 'package:caret':
##
## train
## Loading required package: smoof
## Loading required package: checkmate
## Loading required package: parallel
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: lhs
library(keras)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(clustMixType)
library(cluster)
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.13.4
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:ggpubr':
##
## rotate
## The following object is masked from 'package:stats':
##
## cutree
library(readr)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)
library(PCAmixdata)
# Quick look at the raw Airbnb listings data: first rows, then per-column
# summary statistics (the output below shows 48,895 rows across 16 columns,
# with 10,052 missing values in reviews_per_month).
head(ds)
summary(ds)
## id name host_id host_name
## Min. : 2539 Length:48895 Min. : 2438 Length:48895
## 1st Qu.: 9471945 Class :character 1st Qu.: 7822033 Class :character
## Median :19677284 Mode :character Median : 30793816 Mode :character
## Mean :19017143 Mean : 67620011
## 3rd Qu.:29152178 3rd Qu.:107434423
## Max. :36487245 Max. :274321313
##
## neighbourhood_group neighbourhood latitude longitude
## Length:48895 Length:48895 Min. :40.50 Min. :-74.24
## Class :character Class :character 1st Qu.:40.69 1st Qu.:-73.98
## Mode :character Mode :character Median :40.72 Median :-73.96
## Mean :40.73 Mean :-73.95
## 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.71
##
## room_type price minimum_nights number_of_reviews
## Length:48895 Min. : 0.0 Min. : 1.00 Min. : 0.00
## Class :character 1st Qu.: 69.0 1st Qu.: 1.00 1st Qu.: 1.00
## Mode :character Median : 106.0 Median : 3.00 Median : 5.00
## Mean : 152.7 Mean : 7.03 Mean : 23.27
## 3rd Qu.: 175.0 3rd Qu.: 5.00 3rd Qu.: 24.00
## Max. :10000.0 Max. :1250.00 Max. :629.00
##
## last_review reviews_per_month calculated_host_listings_count
## Length:48895 Min. : 0.010 Min. : 1.000
## Class :character 1st Qu.: 0.190 1st Qu.: 1.000
## Mode :character Median : 0.720 Median : 1.000
## Mean : 1.373 Mean : 7.144
## 3rd Qu.: 2.020 3rd Qu.: 2.000
## Max. :58.500 Max. :327.000
## NA's :10052
## availability_365
## Min. : 0.0
## 1st Qu.: 0.0
## Median : 45.0
## Mean :112.8
## 3rd Qu.:227.0
## Max. :365.0
##
# Print the dataset dimensions (rows, columns).
cat("Shape of the dataset:" ,dim(ds))
## Shape of the dataset: 48895 16
# Overlay listing locations on a background map image, coloured by price.
# Assumes "map.png" is in the working directory -- TODO confirm path.
img <- readPNG("map.png")
map_ds = ggplot() + background_image(img)+ geom_point(data = ds, aes(y=latitude,x = longitude, color = price))
map_ds
# Count missing values per column.
# colSums(is.na(...)) replaces apply(ds, 2, ...): apply() first coerces the
# data frame to a (character) matrix, which is both slower and type-unsafe;
# the resulting counts are identical.
colSums(is.na(ds))
## id name
## 0 0
## host_id host_name
## 0 0
## neighbourhood_group neighbourhood
## 0 0
## latitude longitude
## 0 0
## room_type price
## 0 0
## minimum_nights number_of_reviews
## 0 0
## last_review reviews_per_month
## 0 10052
## calculated_host_listings_count availability_365
## 0 0
Â
# Keep only the modelling features: borough, coordinates, room type, price.
# dplyr:: prefix is required because MASS::select masks dplyr::select here.
dataset = ds %>% dplyr::select(neighbourhood_group,latitude, longitude, room_type,price)
head(dataset)
Â
# Filter out price outliers, integer-encode the categorical columns, and
# standardize the numeric ones.
#
# @param df Data frame with columns neighbourhood_group, latitude, longitude,
#   room_type and price.
# @return A list with two variants of the processed frame:
#   $df  -- price standardized, raw coordinates (used for regression);
#   $df2 -- price AND coordinates standardized (used for clustering).
scale_data = function(df)
{
  # Keep only listings with plausible nightly prices to limit outlier leverage.
  df = df %>% filter(price >= 15 & price <= 500)
  numerical = c("price")
  numerical2 = c("latitude", "longitude")
  categorical = c("room_type", "neighbourhood_group")
  # Encode each categorical column as a factor labelled 1..k.
  # FIX: the argument is `levels`, not `level` (the original relied on
  # partial argument matching). Loop variable renamed so it no longer
  # shadows base::cat(). Codes follow order of first appearance in the data.
  for (col in categorical)
  {
    col_values = unique(df[[col]])
    df[[col]] = factor(df[[col]],
                       levels = col_values,
                       labels = seq_along(col_values))
  }
  # Standardize price (z-score).
  df[numerical] = as.numeric(scale(df[[numerical]]))
  df2 = df
  # Assign the scaled 2-column matrix directly instead of flattening it with
  # as.numeric(); values are identical but column alignment is explicit.
  df2[numerical2] = scale(df2[numerical2])
  # Return both variants.
  list(df2 = df2, df = df)
}
# Run the preprocessing and keep both variants:
#   `dataset` -- raw coordinates (regression/tree models),
#   `data`    -- standardized coordinates (clustering).
dataframe = scale_data(dataset)
dataset = dataframe$df
data = dataframe$df2
head(dataset)
summary(dataset)
## neighbourhood_group latitude longitude room_type
## 1:19858 Min. :40.50 Min. :-74.24 1:22167
## 2:20877 1st Qu.:40.69 1st Qu.:-73.98 2:24503
## 3: 5632 Median :40.72 Median :-73.96 3: 1145
## 4: 366 Mean :40.73 Mean :-73.95
## 5: 1082 3rd Qu.:40.76 3rd Qu.:-73.94
## Max. :40.91 Max. :-73.71
## price
## Min. :-1.3248
## 1st Qu.:-0.7228
## Median :-0.3479
## Mean : 0.0000
## 3rd Qu.: 0.4587
## Max. : 4.1847
Â
# Re-plot the price map after filtering/encoding; price is now a z-score,
# so the colour scale is centred near 0 rather than in dollars.
mappa = ggplot() + background_image(img)+ geom_point(data = dataset, aes(y=latitude,x = longitude, color = price))
mappa
Â
# Build per-neighbourhood and per-(neighbourhood, room-type) sub-frames.
#   lis_n / lis_r_n : from `dataset` (raw coordinates) -- regression inputs.
#   clust_data      : from `data` (scaled coordinates) -- clustering inputs.
neighbourhoods = unique(dataset$neighbourhood_group)
rooms = unique(dataset$room_type)
clust_data = vector("list")
lis_n = vector("list")
for (n in neighbourhoods)
{
  tmp = dataset %>% filter(neighbourhood_group == n)
  # Drop neighbourhood_group: it is constant within the subset.
  lis_n[[n]] = tmp[-1]
  tmp2 = data %>% filter(neighbourhood_group == n)
  clust_data[[n]] = tmp2[-1]
  # Restore human-readable room-type labels for the clustering frames.
  # FIX: argument is `levels`, not `level` (partial matching in original).
  # NOTE(review): mapping codes back via unique(ds$room_type) assumes the
  # same first-appearance order as during encoding -- verify.
  clust_data[[n]]$room_type = factor(clust_data[[n]]$room_type,
                                     levels = unique(clust_data[[n]]$room_type),
                                     labels = unique(ds$room_type))
}
lis_r_n = vector("list")
for (n in neighbourhoods)
{
  for (r in rooms)
  {
    tmp = dataset %>%
      filter(room_type == r & neighbourhood_group == n)
    # Drop neighbourhood_group and room_type (both constant here).
    lis_r_n[[paste0("n", n, "-", "r", r)]] = tmp[-1][-3]
    tmp2 = data %>%
      filter(room_type == r & neighbourhood_group == n)
    clust_data[[paste0("n", n, "-", "r", r)]] = tmp2[-1][-3]
  }
}
# Restore readable labels on the full scaled frame used for clustering.
data$neighbourhood_group = factor(data$neighbourhood_group,
                                  levels = unique(data$neighbourhood_group),
                                  labels = unique(ds$neighbourhood_group))
data$room_type = factor(data$room_type,
                        levels = unique(data$room_type),
                        labels = unique(ds$room_type))
clust_data[["all"]] = data
Â
# Build 75/25 train/test splits for every subset plus the full dataset.
#
# BUG FIX: caTools::sample.split() expects the outcome VECTOR, not a data
# frame. The original passed the frame, so the logical mask had length
# ncol(df) and was recycled row-wise by subset() -- a deterministic,
# non-random "split". Splitting on the price column gives a proper random
# split (stratified over binned price, per sample.split's semantics).
trains = vector("list")
tests = vector("list")
datas = vector("list")
for (i in names(lis_n))
{
  mask = sample.split(lis_n[[i]]$price, SplitRatio = .75)
  trains[[i]] = subset(lis_n[[i]], mask == TRUE)
  tests[[i]] = subset(lis_n[[i]], mask == FALSE)
  datas[[i]] = lis_n[[i]]
}
for (i in names(lis_r_n))
{
  mask = sample.split(lis_r_n[[i]]$price, SplitRatio = .75)
  trains[[i]] = subset(lis_r_n[[i]], mask == TRUE)
  tests[[i]] = subset(lis_r_n[[i]], mask == FALSE)
  datas[[i]] = lis_r_n[[i]]
}
mask = sample.split(dataset$price, SplitRatio = .75)
train = subset(dataset, mask == TRUE)
test = subset(dataset, mask == FALSE)
trains[["all"]] = train
tests[["all"]] = test
datas[["all"]] = dataset
Â
Â
# Collection of all fitted model families (linear regression, trees, ...).
model_lis = vector("list")

# Fit one linear model per subset; keep fit, summary, predictions and
# test-set MSE, and print each model's summary.
# FIX: the original's chained assignment created a variable named `lm.fit`,
# shadowing stats::lm.fit() (and `pr.lm` similarly); plain locals are used
# instead, and MSE is computed with mean() rather than sum()/nrow().
lin_reg = vector("list")
for (sub in names(trains))
{
  fit = lm(price ~ ., data = trains[[sub]])
  pred = predict(fit, tests[[sub]])
  lin_reg[[sub]]$fit = fit
  lin_reg[[sub]]$summary = summary(fit)
  lin_reg[[sub]]$pred = pred
  # Test-set mean squared error.
  lin_reg[[sub]]$MSE = mean((pred - tests[[sub]]$price)^2)
  print(paste0("========== ", sub, " =========="))
  print(summary(fit))
  cat("\n\n")
}
## [1] "========== 1 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8345 -0.3464 -0.1178 0.1731 4.9573
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -548.77851 20.08969 -27.316 < 2e-16 ***
## latitude 5.37557 0.20584 26.115 < 2e-16 ***
## longitude -4.45432 0.22293 -19.980 < 2e-16 ***
## room_type2 0.97147 0.01122 86.566 < 2e-16 ***
## room_type3 -0.17381 0.03864 -4.499 6.89e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6676 on 14888 degrees of freedom
## Multiple R-squared: 0.3904, Adjusted R-squared: 0.3902
## F-statistic: 2383 on 4 and 14888 DF, p-value: < 2.2e-16
##
##
##
## [1] "========== 2 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4027 -0.5349 -0.1876 0.2847 4.7542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -841.73748 58.43516 -14.405 < 2e-16 ***
## latitude -0.51541 0.35505 -1.452 0.147
## longitude -11.65953 0.61678 -18.904 < 2e-16 ***
## room_type2 1.01447 0.01503 67.505 < 2e-16 ***
## room_type3 -0.28394 0.04780 -5.941 2.9e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8812 on 15653 degrees of freedom
## Multiple R-squared: 0.3352, Adjusted R-squared: 0.3351
## F-statistic: 1973 on 4 and 15653 DF, p-value: < 2.2e-16
##
##
##
## [1] "========== 3 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3742 -0.2988 -0.1270 0.1391 5.0902
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.94589 12.53702 -0.634 0.52625
## latitude -0.67106 0.28710 -2.337 0.01947 *
## longitude -0.46762 0.20025 -2.335 0.01958 *
## room_type2 0.80929 0.01951 41.478 < 2e-16 ***
## room_type3 -0.17440 0.05165 -3.377 0.00074 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6028 on 4219 degrees of freedom
## Multiple R-squared: 0.3044, Adjusted R-squared: 0.3037
## F-statistic: 461.5 on 4 and 4219 DF, p-value: < 2.2e-16
##
##
##
## [1] "========== 4 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9102 -0.3342 -0.1469 0.2236 3.4561
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -41.03733 131.03461 -0.313 0.754
## latitude -0.46768 1.38088 -0.339 0.735
## longitude -0.79964 1.23282 -0.649 0.517
## room_type2 0.69732 0.07446 9.365 <2e-16 ***
## room_type3 -0.13585 0.23435 -0.580 0.563
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6009 on 270 degrees of freedom
## Multiple R-squared: 0.2664, Adjusted R-squared: 0.2555
## F-statistic: 24.51 on 4 and 270 DF, p-value: < 2.2e-16
##
##
##
## [1] "========== 5 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9669 -0.2906 -0.1359 0.1124 4.9772
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.09595 75.12511 0.827 0.409
## latitude -0.46076 0.89539 -0.515 0.607
## longitude 0.59638 0.72879 0.818 0.413
## room_type2 0.67380 0.04732 14.238 <2e-16 ***
## room_type3 -0.15156 0.09529 -1.591 0.112
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6266 on 806 degrees of freedom
## Multiple R-squared: 0.2199, Adjusted R-squared: 0.216
## F-statistic: 56.79 on 4 and 806 DF, p-value: < 2.2e-16
##
##
##
## [1] "========== n1-r1 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7167 -0.2399 -0.0919 0.1142 5.1429
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -347.2980 19.5998 -17.72 <2e-16 ***
## latitude 2.9184 0.2014 14.49 <2e-16 ***
## longitude -3.0816 0.2151 -14.32 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4287 on 6721 degrees of freedom
## Multiple R-squared: 0.04726, Adjusted R-squared: 0.04698
## F-statistic: 166.7 on 2 and 6721 DF, p-value: < 2.2e-16
##
##
##
## [1] "========== n1-r2 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9631 -0.5724 -0.2106 0.2963 4.4458
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -764.9500 39.6271 -19.30 <2e-16 ***
## latitude 8.2186 0.4068 20.20 <2e-16 ***
## longitude -5.8264 0.4454 -13.08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8599 on 6238 degrees of freedom
## Multiple R-squared: 0.07359, Adjusted R-squared: 0.07329
## F-statistic: 247.7 on 2 and 6238 DF, p-value: < 2.2e-16
##
##
##
## [1] "========== n1-r3 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43709 -0.22328 -0.13609 0.05886 2.72639
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -322.3727 94.7421 -3.403 0.000769 ***
## latitude 2.8413 0.7927 3.584 0.000401 ***
## longitude -2.7841 1.0335 -2.694 0.007504 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 270 degrees of freedom
## Multiple R-squared: 0.05102, Adjusted R-squared: 0.04399
## F-statistic: 7.258 on 2 and 270 DF, p-value: 0.0008505
##
##
##
## [1] "========== n2-r1 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.2282 -0.3714 -0.1644 0.1214 4.7343
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -614.1915 82.3469 -7.459 1.02e-13 ***
## latitude -0.8686 0.4705 -1.846 0.065 .
## longitude -8.7780 0.8779 -9.999 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6872 on 5253 degrees of freedom
## Multiple R-squared: 0.1038, Adjusted R-squared: 0.1035
## F-statistic: 304.4 on 2 and 5253 DF, p-value: < 2.2e-16
##
##
##
## [1] "========== n2-r2 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4464 -0.6834 -0.2428 0.4327 3.8755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -888.5411 89.7697 -9.898 <2e-16 ***
## latitude -0.9338 0.5683 -1.643 0.1
## longitude -12.5366 0.9398 -13.339 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9986 on 8343 degrees of freedom
## Multiple R-squared: 0.07902, Adjusted R-squared: 0.0788
## F-statistic: 357.9 on 2 and 8343 DF, p-value: < 2.2e-16
##
##
##
## [1] "========== n2-r3 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7868 -0.3643 -0.2004 0.0641 4.7188
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1051.110 315.024 -3.337 0.000950 ***
## latitude 5.082 1.892 2.686 0.007624 **
## longitude -11.401 3.362 -3.391 0.000786 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.722 on 314 degrees of freedom
## Multiple R-squared: 0.0354, Adjusted R-squared: 0.02925
## F-statistic: 5.761 on 2 and 314 DF, p-value: 0.00349
##
##
##
## [1] "========== n3-r1 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5419 -0.2240 -0.0917 0.1024 4.9175
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.2460 12.3016 -1.646 0.0999 .
## latitude -0.1631 0.2852 -0.572 0.5674
## longitude -0.3540 0.1887 -1.876 0.0608 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4257 on 2239 degrees of freedom
## Multiple R-squared: 0.001704, Adjusted R-squared: 0.0008119
## F-statistic: 1.911 on 2 and 2239 DF, p-value: 0.1482
##
##
##
## [1] "========== n3-r2 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3955 -0.5444 -0.2012 0.3158 4.1267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.86387 28.50462 1.118 0.264
## latitude -0.65874 0.61449 -1.072 0.284
## longitude 0.06699 0.48209 0.139 0.889
##
## Residual standard error: 0.8305 on 1381 degrees of freedom
## Multiple R-squared: 0.001545, Adjusted R-squared: 9.897e-05
## F-statistic: 1.068 on 2 and 1381 DF, p-value: 0.3438
##
##
##
## [1] "========== n3-r3 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5401 -0.2595 -0.1545 0.0127 5.0126
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -101.808 90.766 -1.122 0.2641
## latitude 4.512 2.232 2.022 0.0453 *
## longitude 1.122 1.312 0.855 0.3941
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6257 on 126 degrees of freedom
## Multiple R-squared: 0.03195, Adjusted R-squared: 0.01659
## F-statistic: 2.08 on 2 and 126 DF, p-value: 0.1293
##
##
##
## [1] "========== n4-r1 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.4399 -0.2616 -0.1008 0.1382 2.7047
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.8452 140.4584 0.298 0.766
## latitude -1.3294 1.5633 -0.850 0.397
## longitude -0.1532 1.3589 -0.113 0.910
##
## Residual standard error: 0.4341 on 123 degrees of freedom
## Multiple R-squared: 0.008143, Adjusted R-squared: -0.007985
## F-statistic: 0.5049 on 2 and 123 DF, p-value: 0.6048
##
##
##
## [1] "========== n4-r2 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9720 -0.5710 -0.2809 0.3025 3.5485
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 264.212 293.718 0.900 0.370
## latitude -1.752 3.055 -0.574 0.567
## longitude 2.605 2.687 0.970 0.334
##
## Residual standard error: 0.8744 on 111 degrees of freedom
## Multiple R-squared: 0.00842, Adjusted R-squared: -0.009446
## F-statistic: 0.4713 on 2 and 111 DF, p-value: 0.6254
##
##
##
## [1] "========== n4-r3 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## 1 2 4 5 7 8
## 0.153761 -0.025443 -0.290715 0.569389 -0.397762 -0.009229
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2919.25 3307.89 -0.883 0.442
## latitude 46.13 21.25 2.171 0.118
## longitude -14.11 37.85 -0.373 0.734
##
## Residual standard error: 0.444 on 3 degrees of freedom
## Multiple R-squared: 0.6387, Adjusted R-squared: 0.3979
## F-statistic: 2.652 on 2 and 3 DF, p-value: 0.2172
##
##
##
## [1] "========== n5-r1 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46134 -0.21757 -0.08786 0.10675 2.68073
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94.6199 61.1603 1.547 0.123
## latitude -0.5680 0.7298 -0.778 0.437
## longitude 0.9775 0.6102 1.602 0.110
##
## Residual standard error: 0.3838 on 428 degrees of freedom
## Multiple R-squared: 0.006174, Adjusted R-squared: 0.00153
## F-statistic: 1.329 on 2 and 428 DF, p-value: 0.2657
##
##
##
## [1] "========== n5-r2 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1884 -0.5191 -0.2658 0.1793 4.2156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -53.3410 175.2402 -0.304 0.761
## latitude 1.6975 2.1824 0.778 0.437
## longitude 0.2177 1.6531 0.132 0.895
##
## Residual standard error: 0.9039 on 248 degrees of freedom
## Multiple R-squared: 0.003215, Adjusted R-squared: -0.004823
## F-statistic: 0.4 on 2 and 248 DF, p-value: 0.6708
##
##
##
## [1] "========== n5-r3 =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34642 -0.21654 -0.11738 0.07302 1.34094
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.9124 260.3294 0.192 0.849
## latitude -0.3451 2.9279 -0.118 0.907
## longitude 0.4976 2.3626 0.211 0.834
##
## Residual standard error: 0.3916 on 37 degrees of freedom
## Multiple R-squared: 0.001198, Adjusted R-squared: -0.05279
## F-statistic: 0.02218 on 2 and 37 DF, p-value: 0.9781
##
##
##
## [1] "========== all =========="
##
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1739 -0.4434 -0.1419 0.2269 4.9515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.916e+02 1.420e+01 -13.497 < 2e-16 ***
## neighbourhood_group2 5.023e-01 1.618e-02 31.041 < 2e-16 ***
## neighbourhood_group3 2.216e-01 1.978e-02 11.202 < 2e-16 ***
## neighbourhood_group4 -9.535e-01 5.882e-02 -16.211 < 2e-16 ***
## neighbourhood_group5 2.687e-01 3.914e-02 6.865 6.81e-12 ***
## latitude -1.726e+00 1.393e-01 -12.385 < 2e-16 ***
## longitude -3.532e+00 1.595e-01 -22.145 < 2e-16 ***
## room_type2 9.996e-01 9.622e-03 103.887 < 2e-16 ***
## room_type3 -2.379e-01 3.068e-02 -7.754 9.18e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.784 on 28680 degrees of freedom
## Multiple R-squared: 0.3914, Adjusted R-squared: 0.3912
## F-statistic: 2306 on 8 and 28680 DF, p-value: < 2.2e-16
# Label the result set and register it in the model collection.
lin_reg$name = "Linear Regression"
model_lis$linear_regression= lin_reg
Â
# Fit one regression tree per subset; store fit, summary, predictions and
# test-set MSE; plot each tree that actually has splits.
# FIX: the original stored summary(tree) in a variable named `sum`,
# shadowing base::sum() inside the very loop that then calls sum(...)
# (it only worked because R's function lookup skips non-function bindings).
dec_tree = vector("list")
for (sub in names(trains))
{
  fit = tree(price ~ ., data = trains[[sub]])
  fit_summary = summary(fit)
  dec_tree[[sub]]$fit = fit
  dec_tree[[sub]]$summary = fit_summary
  print(paste0("========== ", sub, " =========="))
  print(fit_summary)
  if (fit_summary$size > 1)
  {
    plot(fit)
    text(fit, pretty = 0)
    title(paste0("Tree of: ", sub))
  }
  else
  {
    # A single-node tree (no splits) cannot be plotted.
    cat("Not possible to plot tree: ", sub)
  }
  pred = predict(fit, tests[[sub]])
  dec_tree[[sub]]$pred = pred
  # Test-set mean squared error.
  mse = mean((pred - tests[[sub]]$price)^2)
  dec_tree[[sub]]$MSE = mse
  print(mse)
  cat("\n\n")
}
## [1] "========== 1 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 4
## Residual mean deviance: 0.4488 = 6683 / 14890
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.9330 -0.3443 -0.1171 0.0000 0.1669 4.8810
## [1] 0.478872
##
##
## [1] "========== 2 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 4
## Residual mean deviance: 0.7811 = 12230 / 15650
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.2970 -0.5221 -0.1952 0.0000 0.3185 4.7990
## [1] 0.7822847
##
##
## [1] "========== 3 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## [1] "room_type"
## Number of terminal nodes: 2
## Residual mean deviance: 0.3647 = 1540 / 4222
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.3900 -0.3014 -0.1245 0.0000 0.1319 4.9310
## [1] 0.3864543
##
##
## [1] "========== 4 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 9
## Residual mean deviance: 0.2809 = 74.73 / 266
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.8840 -0.2813 -0.1205 0.0000 0.2203 2.4320
## [1] 0.4848371
##
##
## [1] "========== 5 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## [1] "room_type"
## Number of terminal nodes: 2
## Residual mean deviance: 0.3927 = 317.7 / 809
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.9862 -0.2943 -0.1239 0.0000 0.1033 4.9880
## [1] 0.2744778
##
##
## [1] "========== n1-r1 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 4
## Residual mean deviance: 0.1783 = 1198 / 6720
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.75440 -0.22280 -0.07515 0.00000 0.11800 4.94600
## [1] 0.1691098
##
##
## [1] "========== n1-r2 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 4
## Residual mean deviance: 0.731 = 4559 / 6237
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.9250 -0.5518 -0.2070 0.0000 0.3002 4.0740
## [1] 0.7503308
##
##
## [1] "========== n1-r3 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 8
## Residual mean deviance: 0.1442 = 38.21 / 265
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.02700 -0.15190 -0.06911 0.00000 0.04448 2.05000
## [1] 0.2505339
##
##
## [1] "========== n2-r1 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 8
## Residual mean deviance: 0.3902 = 2048 / 5248
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.5970 -0.3301 -0.1267 0.0000 0.1243 4.8870
## [1] 0.3882155
##
##
## [1] "========== n2-r2 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## [1] "longitude"
## Number of terminal nodes: 2
## Residual mean deviance: 1.01 = 8431 / 8344
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.3060 -0.7175 -0.2063 0.0000 0.4071 3.8260
## [1] 0.9585893
##
##
## [1] "========== n2-r3 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 12
## Residual mean deviance: 0.4186 = 127.7 / 305
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.99000 -0.25310 -0.07929 0.00000 0.11360 3.77600
## [1] 0.7380578
##
##
## [1] "========== n3-r1 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 3
## Residual mean deviance: 0.1738 = 389.1 / 2239
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.85200 -0.22640 -0.08418 0.00000 0.11440 4.94200
## [1] 0.1600965
##
##
## [1] "========== n3-r2 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 5
## Residual mean deviance: 0.6466 = 891.7 / 1379
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.9740 -0.5074 -0.2091 0.0000 0.2925 4.1950
## [1] 0.6591288
##
##
## [1] "========== n3-r3 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 6
## Residual mean deviance: 0.2555 = 31.43 / 123
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.81800 -0.15550 -0.07597 0.00000 0.05964 3.40800
## [1] 0.2989787
##
##
## [1] "========== n4-r1 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 7
## Residual mean deviance: 0.1462 = 17.4 / 119
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.81560 -0.18000 -0.03524 0.00000 0.09394 2.13800
## [1] 0.1605553
##
##
## [1] "========== n4-r2 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 8
## Residual mean deviance: 0.6078 = 64.43 / 106
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.5870 -0.4486 -0.1186 0.0000 0.2637 2.9850
## [1] 0.4368075
##
##
## [1] "========== n4-r3 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## character(0)
## Number of terminal nodes: 1
## Residual mean deviance: 0.3273 = 1.637 / 5
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.5301 -0.4165 -0.1609 0.0000 0.2651 0.9466
## Not possible to plot tree: n4-r3[1] 0.09602265
##
##
## [1] "========== n5-r1 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 6
## Residual mean deviance: 0.1351 = 57.44 / 425
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.90420 -0.20040 -0.08683 0.00000 0.08357 2.75300
## [1] 0.2159434
##
##
## [1] "========== n5-r2 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes: 8
## Residual mean deviance: 0.6961 = 169.1 / 243
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.2490 -0.4638 -0.1854 0.0000 0.2079 3.7900
## [1] 0.7622148
##
##
## [1] "========== n5-r3 =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## [1] "latitude"
## Number of terminal nodes: 6
## Residual mean deviance: 0.1029 = 3.5 / 34
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.52540 -0.15660 -0.04544 0.00000 0.04203 0.92870
## [1] 0.04892947
##
##
## [1] "========== all =========="
##
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## [1] "room_type" "longitude" "latitude"
## Number of terminal nodes: 5
## Residual mean deviance: 0.6082 = 17450 / 28680
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.3090 -0.4252 -0.1440 0.0000 0.2223 4.8800
## [1] 0.6005651
# Label the result set and register it in the model collection.
dec_tree$name = "Decision Tree"
model_lis$decision_tree = dec_tree
Â
# Fit a random forest per subset; record the fit, its test-set predictions,
# and the test-set MSE, printing the model and MSE as we go.
rf = vector("list")
for (sub in names(trains))
{
  forest = randomForest(price ~ ., data = trains[[sub]])
  preds = predict(forest, tests[[sub]])
  rf[[sub]]$fit = forest
  rf[[sub]]$pred = preds
  print(paste0("========== ", sub, " =========="))
  print(forest)
  # Test-set mean squared error.
  mse = sum((preds - tests[[sub]]$price)^2) / nrow(tests[[sub]])
  rf[[sub]]$MSE = mse
  print(paste0("MSE: ", mse))
  cat("\n\n")
}
## [1] "========== 1 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.4275465
## % Var explained: 41.5
## [1] "MSE: 0.455167194530864"
##
##
## [1] "========== 2 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.7341265
## % Var explained: 37.13
## [1] "MSE: 0.732725058434287"
##
##
## [1] "========== 3 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.3475054
## % Var explained: 33.39
## [1] "MSE: 0.36031765127259"
##
##
## [1] "========== 4 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.3760561
## % Var explained: 22.19
## [1] "MSE: 0.447650149937715"
##
##
## [1] "========== 5 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.3902594
## % Var explained: 21.96
## [1] "MSE: 0.272695971076505"
##
##
## [1] "========== n1-r1 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.1889355
## % Var explained: 2.03
## [1] "MSE: 0.179037923876284"
##
##
## [1] "========== n1-r2 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.7834144
## % Var explained: 1.81
## [1] "MSE: 0.803170390195746"
##
##
## [1] "========== n1-r3 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.193599
## % Var explained: 4.44
## [1] "MSE: 0.237242915384785"
##
##
## [1] "========== n2-r1 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.4037602
## % Var explained: 23.33
## [1] "MSE: 0.372569290370662"
##
##
## [1] "========== n2-r2 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 1.036605
## % Var explained: 4.22
## [1] "MSE: 0.977980628310895"
##
##
## [1] "========== n2-r3 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.5747621
## % Var explained: -7.36
## [1] "MSE: 0.728586903061683"
##
##
## [1] "========== n3-r1 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.1710464
## % Var explained: 5.67
## [1] "MSE: 0.162726364082157"
##
##
## [1] "========== n3-r2 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.7131623
## % Var explained: -3.47
## [1] "MSE: 0.65418966508262"
##
##
## [1] "========== n3-r3 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.4474675
## % Var explained: -13.3
## [1] "MSE: 0.224070733281232"
##
##
## [1] "========== n4-r1 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.1859426
## % Var explained: -0.26
## [1] "MSE: 0.142122318467766"
##
##
## [1] "========== n4-r2 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.956762
## % Var explained: -27.45
## [1] "MSE: 0.486763656664129"
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## [1] "========== n4-r3 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.2507168
## % Var explained: 8.09
## [1] "MSE: 0.106142985149516"
##
##
## [1] "========== n5-r1 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.16317
## % Var explained: -10.89
## [1] "MSE: 0.216914168102416"
##
##
## [1] "========== n5-r2 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.8909938
## % Var explained: -10.02
## [1] "MSE: 0.642336872850853"
##
##
## [1] "========== n5-r3 =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.1977788
## % Var explained: -39.25
## [1] "MSE: 0.0534407934234958"
##
##
## [1] "========== all =========="
##
## Call:
## randomForest(formula = price ~ ., data = trains[[sub]])
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 0.5793331
## % Var explained: 42.61
## [1] "MSE: 0.567668204356476"
# Label the random-forest results and register them in the model list.
rf$name <- "Random Forest"
model_lis$random_forest <- rf
# Fit one ranger random forest (regression mode) per train/test split;
# ranger's predict() returns a list, so predictions live in $predictions.
ranger_rf <- vector("list")
for (sub in names(trains)) {
  print(paste0("========== ", sub, " =========="))
  fit <- ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,
                classification = FALSE)
  pred <- predict(fit, tests[[sub]])
  mse <- sum((pred$predictions - tests[[sub]]$price)^2) / nrow(tests[[sub]])
  ranger_rf[[sub]]$fit <- fit
  ranger_rf[[sub]]$pred <- pred
  ranger_rf[[sub]]$MSE <- mse
  print(fit)
  print(paste0("MSE: ", mse))
  cat("\n\n")
}
## [1] "========== 1 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 14893
## Number of independent variables: 3
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.4250408
## R squared (OOB): 0.418418
## [1] "MSE: 0.451498508118412"
##
##
## [1] "========== 2 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 15658
## Number of independent variables: 3
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.7335627
## R squared (OOB): 0.371838
## [1] "MSE: 0.732041957369135"
##
##
## [1] "========== 3 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 4224
## Number of independent variables: 3
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.3463108
## R squared (OOB): 0.3363436
## [1] "MSE: 0.359392908585742"
##
##
## [1] "========== 4 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 275
## Number of independent variables: 3
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.3774909
## R squared (OOB): 0.221737
## [1] "MSE: 0.448427295260819"
##
##
## [1] "========== 5 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 811
## Number of independent variables: 3
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.3975296
## R squared (OOB): 0.2060783
## [1] "MSE: 0.274350614878704"
##
##
## [1] "========== n1-r1 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 6724
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.1891423
## R squared (OOB): 0.01938132
## [1] "MSE: 0.179420027053377"
##
##
## [1] "========== n1-r2 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 6241
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.783826
## R squared (OOB): 0.01775623
## [1] "MSE: 0.801841176308322"
##
##
## [1] "========== n1-r3 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 273
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.1959439
## R squared (OOB): 0.03639428
## [1] "MSE: 0.240502896012614"
##
##
## [1] "========== n2-r1 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 5256
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.4018633
## R squared (OOB): 0.2370543
## [1] "MSE: 0.372211157839943"
##
##
## [1] "========== n2-r2 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 8346
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 1.036172
## R squared (OOB): 0.04272476
## [1] "MSE: 0.976239444208424"
##
##
## [1] "========== n2-r3 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 317
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.5750939
## R squared (OOB): -0.07087803
## [1] "MSE: 0.745111227940539"
##
##
## [1] "========== n3-r1 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 2242
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.1719263
## R squared (OOB): 0.05225914
## [1] "MSE: 0.161653618970503"
##
##
## [1] "========== n3-r2 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 1384
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.7135851
## R squared (OOB): -0.03460708
## [1] "MSE: 0.658703608163795"
##
##
## [1] "========== n3-r3 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 129
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.4445584
## R squared (OOB): -0.1168601
## [1] "MSE: 0.240502052536699"
##
##
## [1] "========== n4-r1 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 126
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.1873397
## R squared (OOB): -0.002135701
## [1] "MSE: 0.140282184958391"
##
##
## [1] "========== n4-r2 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 114
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.9489554
## R squared (OOB): -0.2530063
## [1] "MSE: 0.493786919827387"
##
##
## [1] "========== n4-r3 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 6
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.2510545
## R squared (OOB): 0.233047
## [1] "MSE: 0.0920501022957262"
##
##
## [1] "========== n5-r1 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 431
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.1640681
## R squared (OOB): -0.112398
## [1] "MSE: 0.215429146469569"
##
##
## [1] "========== n5-r2 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 251
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.9027592
## R squared (OOB): -0.1103079
## [1] "MSE: 0.643759831987133"
##
##
## [1] "========== n5-r3 =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 40
## Number of independent variables: 2
## Mtry: 1
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.1944478
## R squared (OOB): -0.3348204
## [1] "MSE: 0.0515924800078338"
##
##
## [1] "========== all =========="
## Ranger result
##
## Call:
## ranger(price ~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
##
## Type: Regression
## Number of trees: 500
## Sample size: 28689
## Number of independent variables: 4
## Mtry: 2
## Target node size: 5
## Variable importance mode: none
## Splitrule: variance
## OOB prediction error (MSE): 0.5497237
## R squared (OOB): 0.4554978
## [1] "MSE: 0.543636080000731"
# Label the ranger results and register them in the model list.
ranger_rf$name <- "Ranger Random Forest"
model_lis$ranger <- ranger_rf
# Build and compile a small feed-forward regression network:
# two hidden ReLU layers (32 and 16 units) followed by a single
# linear output unit, trained on MSE with RMSprop and tracking MAE.
#
# dimension: number of input features (the input layer width).
# Returns the compiled keras model.
build_model <- function(dimension) {
  model <- keras::keras_model_sequential() %>%
    layer_dense(units = 32, activation = "relu", input_shape = dimension) %>%
    layer_dense(units = 16, activation = "relu") %>%
    layer_dense(units = 1, activation = "linear")
  model %>% compile(
    loss = "mse",
    optimizer = optimizer_rmsprop(),
    metrics = list("mean_absolute_error")
  )
  model
}
print_dot_callback <- callback_lambda(
on_epoch_end = function(epoch, logs) {
if (epoch %% 1 == 0)
cat(".")
}
)
# Train one small keras regression network per train/test split and
# record the model, history, evaluation metrics, predictions, and MSE.
# Assumes `price` is the LAST column of each data frame, since features
# are taken as all columns but the last — TODO confirm against the
# upstream split code.
nn <- vector("list")
for (sub in names(trains)) {
  d <- trains[[sub]]
  d2 <- tests[[sub]]
  len <- length(d)    # column count of the train frame
  len2 <- length(d2)  # column count of the test frame
  # One-hot encode the categorical predictors where present.
  if (!is.null(d$room_type)) {
    d$room_type <- keras::to_categorical(d$room_type)
    d2$room_type <- keras::to_categorical(d2$room_type)
  }
  if (!is.null(d$neighbourhood_group)) {
    d$neighbourhood_group <- keras::to_categorical(d$neighbourhood_group)
    d2$neighbourhood_group <- keras::to_categorical(d2$neighbourhood_group)
  }
  target <- as.vector(d$price)
  features <- as.matrix(as_tibble(d[-len]))
  target_test <- as.vector(d2$price)
  # BUG FIX: the test features previously dropped column index `len`
  # (the TRAIN column count) instead of `len2`; `len2` was computed but
  # never used. Harmless only when both frames have identical widths.
  features_test <- as.matrix(as_tibble(d2[-len2]))
  nn[[sub]]$epochs <- epochs <- 30
  nn[[sub]]$model <- model <- build_model(dim(features)[2])
  nn[[sub]]$summary <- model %>% summary()
  nn[[sub]]$history <- hist <- model %>% fit(
    x = features,
    y = target,
    epochs = epochs,
    validation_split = 0.2,
    verbose = 0,
    callbacks = list(print_dot_callback)
  )
  eva <- model %>% evaluate(features_test, target_test, verbose = 0)
  # BUG FIX: keras evaluate() returns the loss first, then the metrics,
  # so eva[1] is the MSE loss and eva[2] the mean absolute error; the
  # original stored them swapped.
  nn[[sub]]$loss <- eva[1]
  nn[[sub]]$mae <- eva[2]
  nn[[sub]]$pred <- pred <- model %>% predict(features_test)
  nn[[sub]]$MSE <- sum((pred - target_test)^2) / length(target_test)
}
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense (Dense) (None, 32) 224
## ________________________________________________________________________________
## dense_1 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_2 (Dense) (None, 1) 17
## ================================================================================
## Total params: 769
## Trainable params: 769
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_1"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_3 (Dense) (None, 32) 224
## ________________________________________________________________________________
## dense_4 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_5 (Dense) (None, 1) 17
## ================================================================================
## Total params: 769
## Trainable params: 769
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_2"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_6 (Dense) (None, 32) 224
## ________________________________________________________________________________
## dense_7 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_8 (Dense) (None, 1) 17
## ================================================================================
## Total params: 769
## Trainable params: 769
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_3"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_9 (Dense) (None, 32) 224
## ________________________________________________________________________________
## dense_10 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_11 (Dense) (None, 1) 17
## ================================================================================
## Total params: 769
## Trainable params: 769
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_4"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_12 (Dense) (None, 32) 224
## ________________________________________________________________________________
## dense_13 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_14 (Dense) (None, 1) 17
## ================================================================================
## Total params: 769
## Trainable params: 769
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_5"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_15 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_16 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_17 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_6"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_18 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_19 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_20 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_7"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_21 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_22 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_23 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_8"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_24 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_25 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_26 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_9"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_27 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_28 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_29 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_10"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_30 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_31 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_32 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_11"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_33 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_34 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_35 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_12"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_36 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_37 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_38 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_13"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_39 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_40 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_41 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_14"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_42 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_43 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_44 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_15"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_45 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_46 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_47 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_16"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_48 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_49 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_50 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_17"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_51 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_52 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_53 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_18"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_54 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_55 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_56 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_19"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_57 (Dense) (None, 32) 96
## ________________________________________________________________________________
## dense_58 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_59 (Dense) (None, 1) 17
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_20"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_60 (Dense) (None, 32) 416
## ________________________________________________________________________________
## dense_61 (Dense) (None, 16) 528
## ________________________________________________________________________________
## dense_62 (Dense) (None, 1) 17
## ================================================================================
## Total params: 961
## Trainable params: 961
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................
# Label the neural-network results and register them with the other models.
nn$name <- "Neural Networks"
model_lis$neural_networks <- nn

# For every fitted subset ("name" is metadata, not a model), draw the
# training history and report the test-set MAE and loss.
for (sub in names(nn))
{
  if (sub != "name")
  {
    m <- nn[[sub]]
    hist <- m$history
    print(paste0("========== ", sub, " =========="))
    caption_txt <- paste0("MAE: ", round(m$mae, 3), " --- Loss: ", round(m$loss, 3))
    p <- plot(hist, y ~ x) + theme_bw(base_size = 12) +
      ggtitle(paste0("NN of: ", sub)) + labs(caption = caption_txt)
    # print() renders the ggplot once; the original additionally called
    # plot(p), which drew every figure twice (hence the duplicated
    # `geom_smooth()` messages in the knitted output).
    print(p)
    cat("MAE: ", m$mae)
    cat("\nLoss: ", m$loss, "\n\n")
  }
}
## [1] "========== 1 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.6122801
## Loss: 0.4845496
##
## [1] "========== 2 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.8479623
## Loss: 0.6393498
##
## [1] "========== 3 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.4926897
## Loss: 0.4206005
##
## [1] "========== 4 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.5653926
## Loss: 0.5909714
##
## [1] "========== 5 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.9475082
## Loss: 0.8239455
##
## [1] "========== n1-r1 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.3691125
## Loss: 0.4329883
##
## [1] "========== n1-r2 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.857663
## Loss: 0.6326119
##
## [1] "========== n1-r3 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.4154404
## Loss: 0.5541389
##
## [1] "========== n2-r1 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.4940058
## Loss: 0.4452811
##
## [1] "========== n2-r2 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 1.069584
## Loss: 0.8379608
##
## [1] "========== n2-r3 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 1.214825
## Loss: 1.012357
##
## [1] "========== n3-r1 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.1678099
## Loss: 0.2578662
##
## [1] "========== n3-r2 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.6874239
## Loss: 0.5797735
##
## [1] "========== n3-r3 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.0545099
## Loss: 0.1592267
##
## [1] "========== n4-r1 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.1458065
## Loss: 0.3156902
##
## [1] "========== n4-r2 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.3665866
## Loss: 0.492869
##
## [1] "========== n4-r3 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.3569993
## Loss: 0.5379468
##
## [1] "========== n5-r1 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.2133904
## Loss: 0.2625024
##
## [1] "========== n5-r2 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 1.036595
## Loss: 0.7100449
##
## [1] "========== n5-r3 =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.07074246
## Loss: 0.2192494
##
## [1] "========== all =========="
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## MAE: 0.6285941
## Loss: 0.5615722
Â
Â
# Build a comparison table of test-set MSE for every model and subset.
# Subset labels must follow the order the models were fitted in: the 5
# neighbourhood groups, then every neighbourhood/room-type pair, then "All".
# as.character() avoids factor-to-integer coercion when string labels
# are appended below.
conc <- as.character(unique(ds$neighbourhood_group))
for (n in unique(ds$neighbourhood_group))
{
  for (r in unique(ds$room_type))
  {
    conc <- c(conc, paste(n, "-", r))
  }
}
conc <- c(conc, "All")

subset_names <- names(nn)
cols <- vector("list")
cols[["Subset"]] <- conc
for (m in model_lis)
{
  col <- c()
  for (nam in subset_names)
  {
    if (nam != "name")
    {
      col <- c(col, m[[nam]]$MSE)
    }
  }
  cols[[m$name]] <- col
}
mse_df <- as.data.frame(cols)

# For each subset pick the model with the lowest MSE. Restrict which()
# to the same columns the minimum was taken over (2:6); the original
# scanned 1:6, which also included the Subset label column.
best <- c()
for (i in seq_along(cols$Subset))
{
  m <- min(mse_df[i, 2:6])
  col <- which(mse_df[i, 2:6] == m) + 1
  nam <- names(mse_df)[col]
  best <- c(best, nam)
}
mse_df$Best <- best
#kableExtra::kable(mse_df)%>%
# kable_styling(bootstrap_options = "striped", full_width = F,font_size = 20) %>%
## row_spec(0, bold = T, color = "white", background = "#D7261E")%>%
## column_spec(1, bold = T, border_right = T,color = "white", background = "#191970")%>%
# column_spec(2:6,extra_css="text-align:Center")
mse_df
 Â
Â
Â
# Number of clusters to extract from every subset.
clust_num <- 5

# Cluster one subset of listings and prepare its plots.
#
# dts      : data frame with latitude/longitude columns. If it still has
#            a room_type column the data are mixed-type, so k-prototypes
#            (kproto, from clustMixType) is used instead of plain k-means.
# num      : number of clusters to fit.
# dim_plot : plotting window as c(min_lat, max_lat, min_long, max_long).
# name     : label used in the plot titles.
#
# Returns a list with the fitted clustering (cl), one combined map of all
# clusters (myplot), one map per cluster (clust_plot, keyed "1".."num")
# and a summary() of each cluster's rows (summary).
get_clusters <- function(dts, num, dim_plot, name)
{
  l <- list()
  if (is.null(dts$room_type))
  {
    l$cl <- kmeans(dts, num)
  }
  else
  {
    # kproto handles the mixed numeric/categorical columns.
    l$cl <- kproto(dts, num)
  }
  # Split the rows by assigned cluster.
  clust <- list()
  for (i in seq_len(num))
  {
    indexes <- l$cl$cluster == i
    clust[[i]] <- dts[indexes, ]
  }
  min_lat <- dim_plot[1]
  max_lat <- dim_plot[2]
  min_long <- dim_plot[3]
  max_long <- dim_plot[4]
  # `img` is the NYC map loaded earlier in the document, used as the
  # plot background.
  myplot <- ggplot() + background_image(img) +
    xlab('Longitude') + ylab('Latitude') +
    theme(plot.margin = unit(c(1, 1, 1, 1), "cm"),
          legend.title = element_text(colour = "blue", size = 10, face = "bold")) +
    xlim(min_long, max_long) + ylim(min_lat, max_lat) +
    ggtitle(paste0("Clusters of ", name))
  count <- 1
  l$clust_plot <- vector("list")
  for (el in clust)
  {
    # Overlay this cluster on the combined map and also draw it on its
    # own map; the cluster index doubles as the point colour.
    myplot <- myplot + geom_point(data = el, aes(y = latitude, x = longitude), color = count)
    p <- ggplot() + background_image(img) +
      geom_point(data = el, aes(y = latitude, x = longitude), color = count) +
      xlim(min_long, max_long) + ylim(min_lat, max_lat) +
      ggtitle(paste0("Cluster: ", count, " of ", name))
    l$clust_plot[[as.character(count)]] <- p
    l$summary[[as.character(count)]] <- summary(el)
    count <- count + 1
  }
  l$myplot <- myplot
  return(l)
}
# Run get_clusters() on every subset in clust_data and collect the
# results, keyed by subset name. The per-subset combined maps are also
# gathered under "all_plots", in the same order as names(clust_data).
get_all_cluster <- function(clust_data, clust_num, dim)
{
  lis <- vector("list")
  # Preallocate instead of growing the list one element at a time.
  plots <- vector("list", length(clust_data))
  cnt <- 1
  for (sub in names(clust_data))
  {
    lis[[sub]] <- get_clusters(clust_data[[sub]], clust_num, dim, sub)
    plots[[cnt]] <- lis[[sub]]$myplot
    cnt <- cnt + 1
  }
  lis[["all_plots"]] <- plots
  return(lis)
}
# Plot window from the bounding box of the full data set:
# c(min_lat, max_lat, min_long, max_long).
lat_range <- range(clust_data$all$latitude)
long_range <- range(clust_data$all$longitude)
borders <- c(lat_range, long_range)
all_cluster <- get_all_cluster(clust_data, 5, borders)
## # NAs in variables:
## latitude longitude room_type price
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 0.8287906
##
## # NAs in variables:
## latitude longitude room_type price
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.272312
##
## # NAs in variables:
## latitude longitude room_type price
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.701065
##
## # NAs in variables:
## latitude longitude room_type price
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 0.935192
##
## # NAs in variables:
## latitude longitude room_type price
## 0 0 0 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 0.7781472
##
## # NAs in variables:
## neighbourhood_group latitude longitude room_type
## 0 0 0 0
## price
## 0
## 0 observation(s) with NAs.
##
## Estimated lambda: 1.747748
# Arrange a list of ggplot objects on one page using the grid package.
#
# plotlist : list of ggplot objects.
# file     : unused; kept only so existing calls remain valid.
# cols     : number of columns when no explicit layout is given.
# layout   : optional matrix whose entries say which plot fills each cell.
multiplots <- function(plotlist, file = NULL, cols = 2, layout = NULL) {
  require(grid)
  plots <- c(plotlist)
  numPlots <- length(plots)
  if (is.null(layout)) {
    # Fill the grid row by row: `cols` columns, as many rows as needed.
    layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
                     ncol = cols, nrow = ceiling(numPlots / cols), byrow = TRUE)
  }
  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
    for (i in seq_len(numPlots)) {
      # Locate plot i inside the layout matrix and print it in that cell.
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                      layout.pos.col = matchidx$col))
    }
  }
}
# Combined cluster map of the full data set (plot 21: after the 5
# neighbourhood groups and the 15 neighbourhood/room-type pairs).
all_cluster$all_plots[[21]]
# Per-cluster summaries for the full data set.
all_cluster$all$summary
## $`1`
## neighbourhood_group latitude longitude
## Brooklyn :9124 Min. :-2.9118 Min. :-2.813579
## Manhattan : 0 1st Qu.:-1.0252 1st Qu.:-0.337869
## Queens : 809 Median :-0.7689 Median :-0.045750
## Staten Island: 10 Mean :-0.7509 Mean :-0.005138
## Bronx : 0 3rd Qu.:-0.3489 3rd Qu.: 0.307583
## Max. : 0.7754 Max. : 4.680280
## room_type price
## Private room : 294 Min. :-1.32482
## Entire home/apt:9564 1st Qu.:-0.35924
## Shared room : 85 Median : 0.08379
## Mean : 0.18155
## 3rd Qu.: 0.54954
## Max. : 4.18465
##
## $`2`
## neighbourhood_group latitude longitude
## Brooklyn :9691 Min. :-2.9821 Min. :-1.27977
## Manhattan : 0 1st Qu.:-0.9275 1st Qu.:-0.04332
## Queens :3275 Median :-0.6285 Median : 0.35279
## Staten Island: 0 Mean :-0.6364 Mean : 0.72043
## Bronx : 0 3rd Qu.:-0.3030 3rd Qu.: 0.96288
## Max. : 0.9290 Max. : 5.16264
## room_type price
## Private room :11690 Min. :-1.3248
## Entire home/apt: 799 1st Qu.:-0.9499
## Shared room : 477 Median :-0.8136
## Mean :-0.7313
## 3rd Qu.:-0.5864
## Max. : 3.0487
##
## $`3`
## neighbourhood_group latitude longitude
## Brooklyn : 583 Min. :-2.25573 Min. :-3.7011
## Manhattan :10563 1st Qu.:-0.01754 1st Qu.:-0.9270
## Queens : 288 Median : 0.34290 Median :-0.6883
## Staten Island: 9 Mean : 0.37664 Mean :-0.6005
## Bronx : 23 3rd Qu.: 0.69452 3rd Qu.:-0.3315
## Max. : 2.84824 Max. : 3.7225
## room_type price
## Private room : 448 Min. :-0.4160
## Entire home/apt:10975 1st Qu.: 0.4359
## Shared room : 43 Median : 0.8903
## Mean : 1.2141
## 3rd Qu.: 1.8559
## Max. : 4.1847
##
## $`4`
## neighbourhood_group latitude longitude
## Brooklyn : 460 Min. :-4.18085 Min. :-6.3324
## Manhattan :4434 1st Qu.:-0.25722 1st Qu.:-1.0413
## Queens : 25 Median :-0.02293 Median :-0.8472
## Staten Island: 347 Mean :-0.13418 Mean :-1.0240
## Bronx : 0 3rd Qu.: 0.37603 3rd Qu.:-0.7135
## Max. : 0.98828 Max. : 0.1238
## room_type price
## Private room :4126 Min. :-1.3248
## Entire home/apt: 885 1st Qu.:-0.6432
## Shared room : 255 Median :-0.3706
## Mean :-0.3517
## 3rd Qu.:-0.1320
## Max. : 1.9127
##
## $`5`
## neighbourhood_group latitude longitude
## Brooklyn : 0 Min. :0.3207 Min. :-0.681685
## Manhattan :5880 1st Qu.:1.0551 1st Qu.:-0.001138
## Queens :1235 Median :1.4429 Median : 0.206028
## Staten Island: 0 Mean :1.4809 Mean : 0.365540
## Bronx :1059 3rd Qu.:1.8254 3rd Qu.: 0.551628
## Max. :3.3632 Max. : 3.731570
## room_type price
## Private room :5609 Min. :-1.3135
## Entire home/apt:2280 1st Qu.:-0.8250
## Shared room : 285 Median :-0.5864
## Mean :-0.5374
## 3rd Qu.:-0.3592
## Max. : 2.0149
# The list element is named "all_plots"; the original wrote $all_plot
# and only worked through R's partial matching of `$` on lists.
multiplots(all_cluster$all_plots[1:5], cols = 3)
multiplots(all_cluster$all_plots[6:11], cols = 3)
multiplots(all_cluster$all_plots[12:17], cols = 3)
# NOTE(review): indices 22 and 23 look out of range (the knitted output
# shows "NULL" twice here) — probably should end at 21. Verify against
# length(all_cluster$all_plots).
multiplots(all_cluster$all_plots[c(18:20, 22, 23)], cols = 3)
## NULL
## NULL
Â
# Hierarchical clustering of mean price per neighbourhood/room-type pair.
agg <- aggregate(price ~ neighbourhood_group + room_type, clust_data$all, mean)
# Build the row labels from agg itself so every label is guaranteed to
# match its row. (The original generated labels with nested loops over
# unique() values — room type varying fastest — while aggregate() orders
# its rows with the first grouping variable varying fastest, so the
# labels could be attached to the wrong rows.)
rownames(agg) <- paste0(substr(agg$neighbourhood_group, 1, 5), "/",
                        substr(agg$room_type, 1, 3))
agg
# Gower distance handles the mixed factor/numeric columns.
gower <- daisy(agg, metric = "gower")
hc1 <- hclust(gower, method = "complete")
plot(hc1, cex = 0.6, hang = -1)
avg_dend_obj <- as.dendrogram(hc1)
# Colour the branches obtained by cutting the dendrogram at height 0.6.
avg_col_dend <- color_branches(avg_dend_obj, h = 0.6)
plot(avg_col_dend)

# Repeat at neighbourhood level only.
agg <- aggregate(price ~ neighbourhood_group, clust_data$all, mean)
agg
# Take the labels from the aggregated column itself rather than a
# hard-coded vector, so they cannot fall out of sync with the row order.
rownames(agg) <- as.character(agg$neighbourhood_group)
agg$neighbourhood_group <- NULL
gower <- daisy(agg, metric = "gower")
hc1 <- hclust(gower, method = "complete")
plot(hc1, cex = 0.6, hang = -1)
Â
Â
## Split mixed dataset into quantitative and qualitative variables
#split <- splitmix(dataset[1:5])
split = splitmix(clust_data$all)
## PCA
# PCAmix performs PCA generalised to mixed data: ordinary PCA on the
# quantitative columns combined with MCA on the qualitative ones.
# rename.level = TRUE prefixes level names with their variable name.
res.pcamix <- PCAmix(X.quanti=split$X.quanti,
X.quali=split$X.quali,
rename.level=TRUE)
res.pcamix
##
## Call:
## PCAmix(X.quanti = split$X.quanti, X.quali = split$X.quali, rename.level = TRUE)
##
## Method = Principal Component of mixed data (PCAmix)
##
##
## "name" "description"
## "$eig" "eigenvalues of the principal components (PC) "
## "$ind" "results for the individuals (coord,contrib,cos2)"
## "$quanti" "results for the quantitative variables (coord,contrib,cos2)"
## "$levels" "results for the levels of the qualitative variables (coord,contrib,cos2)"
## "$quali" "results for the qualitative variables (contrib,relative contrib)"
## "$sqload" "squared loadings"
## "$coef" "coef of the linear combinations defining the PC"
## Inspect principal components
# Eigenvalue, proportion and cumulative proportion of variance per PC.
res.pcamix$eig
## Eigenvalue Proportion Cumulative
## dim 1 2.1303801 23.670890 23.67089
## dim 2 1.8046238 20.051375 43.72227
## dim 3 1.2456603 13.840670 57.56294
## dim 4 1.0028761 11.143067 68.70600
## dim 5 0.9971611 11.079568 79.78557
## dim 6 0.9570021 10.633357 90.41893
## dim 7 0.4201255 4.668061 95.08699
## dim 8 0.2652652 2.947391 98.03438
## dim 9 0.1769058 1.965620 100.00000
# Correlations of the quantitative variables with each dimension.
res.pcamix$quanti.cor
## dim 1 dim 2 dim 3 dim 4 dim 5
## latitude -0.1972875 0.89428404 -0.2214475 -0.04100129 -0.04272996
## longitude 0.7617791 0.35537824 0.4064136 0.01097103 0.01298250
## price -0.7183816 0.07289152 0.4855170 0.02253704 0.01357485
# Squared correlation ratios of the qualitative variables per dimension.
res.pcamix$quali.eta2
## dim 1 dim 2 dim 3 dim 4 dim 5
## neighbourhood_group 0.6390071 0.871714919 0.4342793 0.5273305 0.4806940
## room_type 0.3560712 0.001558062 0.3614432 0.4732362 0.5142885
Â
`FAMD(base, ncp = 5, sup.var = NULL, ind.sup = NULL, graph = TRUE)` — `base`: a data frame with n rows (individuals) and p columns (variables); `ncp`: the number of dimensions kept in the results (5 by default); `sup.var`: a vector with the indexes of the supplementary variables; `ind.sup`: a vector with the indexes of the supplementary individuals; `graph`: a logical value — if TRUE, a graph is displayed.
# Factor Analysis of Mixed Data on the full subset. Automatic graphs are
# suppressed here; they are drawn explicitly with fviz_* helpers below.
res.famd <- FAMD(clust_data$all, graph = F, ncp = 5)
print(res.famd)
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues and inertia"
## 2 "$var" "Results for the variables"
## 3 "$ind" "results for the individuals"
## 4 "$quali.var" "Results for the qualitative variables"
## 5 "$quanti.var" "Results for the quantitative variables"
# Eigenvalues / percentage of variance explained by each FAMD dimension.
eig.val <- get_eigenvalue(res.famd)
head(eig.val)
## eigenvalue variance.percent cumulative.variance.percent
## Dim.1 2.1303801 23.67089 23.67089
## Dim.2 1.8046238 20.05138 43.72227
## Dim.3 1.2456603 13.84067 57.56294
## Dim.4 1.0028761 11.14307 68.70600
## Dim.5 0.9971611 11.07957 79.78557
# Scree plot of the dimensions, then the quantitative-variable results.
fviz_screeplot(res.famd)
quanti.var <- get_famd_var(res.famd, "quanti.var")
quanti.var
## FAMD results for quantitative variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates"
## 2 "$cos2" "Cos2, quality of representation"
## 3 "$contrib" "Contributions"
# Correlation circle of the quantitative variables, coloured by their
# contribution to the dimensions; repel avoids overlapping labels.
fviz_famd_var(res.famd, "quanti.var", col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE)
quali.var <- get_famd_var(res.famd, "quali.var")
quali.var
## FAMD results for qualitative variable categories
## ===================================================
## Name Description
## 1 "$coord" "Coordinates"
## 2 "$cos2" "Cos2, quality of representation"
## 3 "$contrib" "Contributions"
# Map of the qualitative variable categories, coloured by contribution.
fviz_famd_var(res.famd, "quali.var", col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)
# Results for all variables (quantitative and qualitative together).
var <- get_famd_var(res.famd)
var
## FAMD results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates"
## 2 "$cos2" "Cos2, quality of representation"
## 3 "$contrib" "Contributions"
# Coordinates of the variables on the first five FAMD dimensions.
head(var$coord)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## latitude 0.03892236 0.799743946 0.04903898 0.0016811055 0.0018258497
## longitude 0.58030735 0.126293692 0.16517205 0.0001203636 0.0001685452
## price 0.51607212 0.005313173 0.23572672 0.0005079180 0.0001842764
## neighbourhood_group 0.63900709 0.871714919 0.43427934 0.5273304673 0.4806939587
## room_type 0.35607121 0.001558062 0.36144319 0.4732362202 0.5142884950
# Cos2: quality of representation of each variable on the factor map.
head(var$cos2)
## Dim.1 Dim.2 Dim.3 Dim.4
## latitude 0.00151495 6.395904e-01 0.002404821 2.826116e-06
## longitude 0.33675663 1.595010e-02 0.027281805 1.448739e-08
## price 0.26633043 2.822981e-05 0.055567086 2.579807e-07
## neighbourhood_group 0.10208251 1.899717e-01 0.047149637 6.951936e-02
## room_type 0.06339335 1.213779e-06 0.065320588 1.119763e-01
## Dim.5
## latitude 3.333727e-06
## longitude 2.840749e-08
## price 3.395780e-08
## neighbourhood_group 5.776667e-02
## room_type 1.322463e-01
# Contributions (%) of each variable to the dimensions.
head(var$contrib)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## latitude 1.827015 44.31638047 3.936786 0.16762844 0.18310478
## longitude 27.239615 6.99833910 13.259799 0.01200184 0.01690250
## price 24.224415 0.29441998 18.923837 0.05064614 0.01848011
## neighbourhood_group 29.994980 48.30452324 34.863386 52.58181750 48.20624738
## room_type 16.713975 0.08633722 29.016193 47.18790608 51.57526523
# Plot of all variables (quantitative and qualitative) on the factor map.
fviz_famd_var(res.famd, repel = TRUE)
# Contribution to the first dimension
fviz_contrib(res.famd, "var", axes = 1)
# Contribution to the second dimension
fviz_contrib(res.famd, "var", axes = 2)
# Contribution to the third dimension
fviz_contrib(res.famd, "var", axes = 3)
# Contribution to the fourth dimension
fviz_contrib(res.famd, "var", axes = 4)
# Contribution to the fifth dimension
fviz_contrib(res.famd, "var", axes = 5)